Go through all of the citations in the citation_data directory, and extract from each one the list of authors. Save the authors into the citations.h5 database.


In [1]:
import pandas as pd
import json
import os
import sys
from unidecode import unidecode
from IPython.display import clear_output

In [2]:
# open the HDF5 database (mode='w' truncates any existing file, so the
# database is rebuilt from scratch on every run and there is no point in
# checking whether a citation has already been stored)
store = pd.HDFStore("citations.h5", mode='w')

# the try/finally guarantees that the store is closed even if something
# raises partway through the loop
try:
    for filename in sorted(os.listdir("citation_data")):
        # read the citation data
        with open(os.path.join("citation_data", filename), 'r') as fh:
            # skip citations that are malformed; json reports bad input as a
            # ValueError, so catch only that rather than using a bare except
            # (which would also swallow KeyboardInterrupt)
            try:
                citation = json.load(fh)
            except ValueError:
                continue

        # skip citations that don't have author or DOI information (TypeError
        # covers the case where the JSON is not a dictionary at all)
        try:
            bibliographic = citation['bibliographic']
            authors = bibliographic['author']
            doi = bibliographic['DOI']
        except (TypeError, KeyError):
            continue

        # build the table of authors, dropping rows that are entirely empty,
        # and tag every row with the doi it came from
        df = pd.DataFrame(authors).dropna(how='all', axis=0)
        df['doi'] = doi

        # construct the place we will store it in the database (we need a fancy
        # naming scheme because otherwise we have 10000 objects all under the
        # same group -- we should take advantage of HDF5's hierarchical
        # nature). Also we have to prefix the numbers with underscores
        # otherwise the names are not considered "natural" -- i.e. they are
        # not valid python identifiers.
        doi_parts = doi.split(".")
        prefix = "_".join(doi_parts[:-1])
        number = doi_parts[-1]
        # one nested group per character of the number, except the last two
        number_path = "/_".join(list(number)[:-2])
        store_path = "/dois/_{}/_{}/_{}".format(prefix, number_path, number)

        # print progress
        clear_output()
        print("Saving '{}' to '{}'".format(filename, store_path))
        sys.stdout.flush()

        # save into the database
        store.put(store_path, df)
finally:
    # close the HDF5 database
    store.close()


Saving 'doi_12257.json' to '/dois/_10_1371/journal_pone/_0/_1/_1/_0/_2/_0110224'

In [3]:
# check the size of the database before recompressing it
!ls -lh citations.h5


-rw-rw-r-- 1 jhamrick jhamrick 9.8G Oct 19 13:11 citations.h5

In [4]:
# recompress the database so it's not quite so gigantic: ptrepack writes a
# copy of citations.h5 to citations-small.h5 using blosc compression at
# level 9
!ptrepack --chunkshape=auto --propindexes --complevel=9 --complib=blosc citations.h5 citations-small.h5

In [5]:
# rename the compressed database, replacing the original citations.h5
!mv citations-small.h5 citations.h5

In [6]:
# check the size of the database again, now that it has been recompressed
!ls -lh citations.h5


-rw-rw-r-- 1 jhamrick jhamrick 120M Oct 19 13:15 citations.h5